From a5ab702e1c8f3003122196080ececfe39ace15a1 Mon Sep 17 00:00:00 2001 From: "kaf24@camelot.eng.3leafnetworks.com" Date: Sat, 4 Sep 2004 18:32:45 +0000 Subject: [PATCH] bitkeeper revision 1.1159.69.9 (413a0a4d7ODoJ_4kti38jM2_4EooQA) Cset exclude: kaf24@camelot.eng.3leafnetworks.com|ChangeSet|20040903222602|39015 --- .../include/asm-xen/pgalloc.h | 2 + .../arch/xen/i386/mm/hypervisor.c | 50 +++++++++++- .../include/asm-xen/hypervisor.h | 12 +++ tools/libxc/xc_linux_build.c | 2 +- tools/libxc/xc_linux_restore.c | 30 ++++--- tools/libxc/xc_netbsd_build.c | 2 +- xen/arch/x86/memory.c | 45 ++++++----- xen/common/schedule.c | 14 ++++ xen/include/asm-x86/mm.h | 81 +++++++++++++++++-- xen/include/hypervisor-ifs/hypervisor-if.h | 11 ++- 10 files changed, 208 insertions(+), 41 deletions(-) diff --git a/linux-2.4.27-xen-sparse/include/asm-xen/pgalloc.h b/linux-2.4.27-xen-sparse/include/asm-xen/pgalloc.h index be678385f3..f6bee4d689 100644 --- a/linux-2.4.27-xen-sparse/include/asm-xen/pgalloc.h +++ b/linux-2.4.27-xen-sparse/include/asm-xen/pgalloc.h @@ -134,6 +134,7 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) { clear_page(pte); __make_page_readonly(pte); + queue_pte_pin(__pa(pte)); } return pte; @@ -152,6 +153,7 @@ static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, static __inline__ void pte_free_slow(pte_t *pte) { + queue_pte_unpin(__pa(pte)); __make_page_writable(pte); free_page((unsigned long)pte); } diff --git a/linux-2.6.8.1-xen-sparse/arch/xen/i386/mm/hypervisor.c b/linux-2.6.8.1-xen-sparse/arch/xen/i386/mm/hypervisor.c index 03158c3ee4..a57eabcd8c 100644 --- a/linux-2.6.8.1-xen-sparse/arch/xen/i386/mm/hypervisor.c +++ b/linux-2.6.8.1-xen-sparse/arch/xen/i386/mm/hypervisor.c @@ -85,6 +85,8 @@ static void DEBUG_disallow_pt_read(unsigned long va) #undef queue_invlpg #undef queue_pgd_pin #undef queue_pgd_unpin +#undef queue_pte_pin +#undef queue_pte_unpin #undef queue_set_ldt #endif @@ -217,7 +219,7 @@ void 
queue_pgd_pin(unsigned long ptr) spin_lock_irqsave(&update_lock, flags); update_queue[idx].ptr = phys_to_machine(ptr); update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; - update_queue[idx].val = MMUEXT_PIN_TABLE; + update_queue[idx].val = MMUEXT_PIN_L2_TABLE; increment_index(); spin_unlock_irqrestore(&update_lock, flags); } @@ -233,6 +235,28 @@ void queue_pgd_unpin(unsigned long ptr) spin_unlock_irqrestore(&update_lock, flags); } +void queue_pte_pin(unsigned long ptr) +{ + unsigned long flags; + spin_lock_irqsave(&update_lock, flags); + update_queue[idx].ptr = phys_to_machine(ptr); + update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; + update_queue[idx].val = MMUEXT_PIN_L1_TABLE; + increment_index(); + spin_unlock_irqrestore(&update_lock, flags); +} + +void queue_pte_unpin(unsigned long ptr) +{ + unsigned long flags; + spin_lock_irqsave(&update_lock, flags); + update_queue[idx].ptr = phys_to_machine(ptr); + update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; + update_queue[idx].val = MMUEXT_UNPIN_TABLE; + increment_index(); + spin_unlock_irqrestore(&update_lock, flags); +} + void queue_set_ldt(unsigned long ptr, unsigned long len) { unsigned long flags; @@ -315,7 +339,7 @@ void xen_pgd_pin(unsigned long ptr) spin_lock_irqsave(&update_lock, flags); update_queue[idx].ptr = phys_to_machine(ptr); update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; - update_queue[idx].val = MMUEXT_PIN_TABLE; + update_queue[idx].val = MMUEXT_PIN_L2_TABLE; increment_index_and_flush(); spin_unlock_irqrestore(&update_lock, flags); } @@ -331,6 +355,28 @@ void xen_pgd_unpin(unsigned long ptr) spin_unlock_irqrestore(&update_lock, flags); } +void xen_pte_pin(unsigned long ptr) +{ + unsigned long flags; + spin_lock_irqsave(&update_lock, flags); + update_queue[idx].ptr = phys_to_machine(ptr); + update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; + update_queue[idx].val = MMUEXT_PIN_L1_TABLE; + increment_index_and_flush(); + spin_unlock_irqrestore(&update_lock, flags); +} + +void xen_pte_unpin(unsigned long ptr) +{ + 
unsigned long flags; + spin_lock_irqsave(&update_lock, flags); + update_queue[idx].ptr = phys_to_machine(ptr); + update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; + update_queue[idx].val = MMUEXT_UNPIN_TABLE; + increment_index_and_flush(); + spin_unlock_irqrestore(&update_lock, flags); +} + void xen_set_ldt(unsigned long ptr, unsigned long len) { unsigned long flags; diff --git a/linux-2.6.8.1-xen-sparse/include/asm-xen/hypervisor.h b/linux-2.6.8.1-xen-sparse/include/asm-xen/hypervisor.h index 0de4075b28..4d7ddc55a7 100644 --- a/linux-2.6.8.1-xen-sparse/include/asm-xen/hypervisor.h +++ b/linux-2.6.8.1-xen-sparse/include/asm-xen/hypervisor.h @@ -54,6 +54,8 @@ void queue_tlb_flush(void); void queue_invlpg(unsigned long ptr); void queue_pgd_pin(unsigned long ptr); void queue_pgd_unpin(unsigned long ptr); +void queue_pte_pin(unsigned long ptr); +void queue_pte_unpin(unsigned long ptr); void queue_set_ldt(unsigned long ptr, unsigned long bytes); void queue_machphys_update(unsigned long mfn, unsigned long pfn); void xen_l1_entry_update(pte_t *ptr, unsigned long val); @@ -63,6 +65,8 @@ void xen_tlb_flush(void); void xen_invlpg(unsigned long ptr); void xen_pgd_pin(unsigned long ptr); void xen_pgd_unpin(unsigned long ptr); +void xen_pte_pin(unsigned long ptr); +void xen_pte_unpin(unsigned long ptr); void xen_set_ldt(unsigned long ptr, unsigned long bytes); void xen_machphys_update(unsigned long mfn, unsigned long pfn); #define MMU_UPDATE_DEBUG 0 @@ -137,6 +141,14 @@ extern page_update_debug_t update_debug_queue[]; printk("PGD UNPIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \ queue_pgd_unpin(_p); \ }) +#define queue_pte_pin(_p) ({ \ + printk("PTE PIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \ + queue_pte_pin(_p); \ +}) +#define queue_pte_unpin(_p) ({ \ + printk("PTE UNPIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \ + queue_pte_unpin(_p); \ +}) #define queue_set_ldt(_p,_l) ({ \ printk("SETL LDT %s %d: %08lx %d\n", __FILE__, __LINE__, (_p), (_l)); \ queue_set_ldt((_p), 
(_l)); \ diff --git a/tools/libxc/xc_linux_build.c b/tools/libxc/xc_linux_build.c index 1242334dad..41569b8057 100644 --- a/tools/libxc/xc_linux_build.c +++ b/tools/libxc/xc_linux_build.c @@ -270,7 +270,7 @@ static int setup_guestos(int xc_handle, * correct protection for the page */ if ( add_mmu_update(xc_handle, mmu, - l2tab | MMU_EXTENDED_COMMAND, MMUEXT_PIN_TABLE) ) + l2tab | MMU_EXTENDED_COMMAND, MMUEXT_PIN_L2_TABLE) ) goto error_out; start_info = map_pfn_writeable( diff --git a/tools/libxc/xc_linux_restore.c b/tools/libxc/xc_linux_restore.c index 5e4739f84b..fe7522e1ed 100644 --- a/tools/libxc/xc_linux_restore.c +++ b/tools/libxc/xc_linux_restore.c @@ -473,16 +473,28 @@ int xc_linux_restore(int xc_handle, XcIOContext *ioctxt) */ for ( i = 0; i < nr_pfns; i++ ) { - if ( pfn_type[i] != (L2TAB|LPINTAB) ) - continue; - if ( add_mmu_update(xc_handle, mmu, - (pfn_to_mfn_table[i]<> 2)) ) + ((unsigned long) + pl2e & ~PAGE_MASK) >> 2 )) ) return 0; if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) @@ -827,9 +826,21 @@ static int do_extended_command(unsigned long ptr, unsigned long val) switch ( cmd ) { - case MMUEXT_PIN_TABLE: + case MMUEXT_PIN_L1_TABLE: + case MMUEXT_PIN_L2_TABLE: + + /* When we pin an L1 page we now insist that the va + backpointer (used for writable page tables) must still be + mutable. This is an additional restriction even for guests + that don't use writable page tables, but I don't think it + will break anything as guests typically pin pages before + they are used, hence they'll still be mutable. */ + okay = get_page_and_type_from_pagenr( - pfn, PGT_l2_page_table, FOREIGNDOM); + pfn, + ((cmd==MMUEXT_PIN_L2_TABLE) ? 
+ PGT_l2_page_table : (PGT_l1_page_table | PGT_va_mutable) ) , + FOREIGNDOM); if ( unlikely(!okay) ) { @@ -1184,7 +1195,6 @@ int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count) unsigned long prev_spfn = 0; l1_pgentry_t *prev_spl1e = 0; struct domain *d = current; - u32 type_info; perfc_incrc(calls_to_mmu_update); perfc_addc(num_page_updates, count); @@ -1233,11 +1243,10 @@ int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count) } page = &frame_table[pfn]; - switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) + switch ( (page->u.inuse.type_info & PGT_type_mask) ) { case PGT_l1_page_table: - if ( likely(get_page_type( - page, type_info & (PGT_type_mask|PGT_va_mask))) ) + if ( likely(passive_get_page_type(page, PGT_l1_page_table)) ) { okay = mod_l1_entry((l1_pgentry_t *)va, mk_l1_pgentry(req.val)); @@ -1487,11 +1496,11 @@ void ptwr_reconnect_disconnected(unsigned long addr) [ptwr_info[cpu].writable_l1>>PAGE_SHIFT]; #ifdef PTWR_TRACK_DOMAIN - if (ptwr_domain[cpu] != current->domain) + if (ptwr_domain[cpu] != get_current()->domain) printk("ptwr_reconnect_disconnected domain mismatch %d != %d\n", - ptwr_domain[cpu], current->domain); + ptwr_domain[cpu], get_current()->domain); #endif - PTWR_PRINTK(("[A] page fault in disconn space: addr %08lx space %08lx\n", + PTWR_PRINTK(("[A] page fault in disconnected space: addr %08lx space %08lx\n", addr, ptwr_info[cpu].disconnected << L2_PAGETABLE_SHIFT)); pl2e = &linear_l2_table[ptwr_info[cpu].disconnected]; @@ -1563,9 +1572,9 @@ void ptwr_flush_inactive(void) int i, idx; #ifdef PTWR_TRACK_DOMAIN - if (ptwr_info[cpu].domain != current->domain) + if (ptwr_info[cpu].domain != get_current()->domain) printk("ptwr_flush_inactive domain mismatch %d != %d\n", - ptwr_info[cpu].domain, current->domain); + ptwr_info[cpu].domain, get_current()->domain); #endif #if 0 { @@ -1646,9 +1655,9 @@ int ptwr_do_page_fault(unsigned long addr) if ( (page->u.inuse.type_info & PGT_type_mask) == 
PGT_l1_page_table ) { #ifdef PTWR_TRACK_DOMAIN - if ( ptwr_info[cpu].domain != current->domain ) + if ( ptwr_info[cpu].domain != get_current()->domain ) printk("ptwr_do_page_fault domain mismatch %d != %d\n", - ptwr_info[cpu].domain, current->domain); + ptwr_info[cpu].domain, get_current()->domain); #endif pl2e = &linear_l2_table[(page->u.inuse.type_info & PGT_va_mask) >> PGT_va_shift]; diff --git a/xen/common/schedule.c b/xen/common/schedule.c index a986ee06a1..cc06d3c085 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -374,6 +374,20 @@ void __enter_scheduler(void) cleanup_writable_pagetable( prev, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE); +#ifdef PTWR_TRACK_DOMAIN + { + extern domid_t ptwr_domain[]; + int cpu = smp_processor_id(); + if (ptwr_domain[cpu] != prev->domain) + printk("switch_to domain mismatch %d != %d\n", + ptwr_domain[cpu], prev->domain); + ptwr_domain[cpu] = next->domain; + if (ptwr_disconnected[cpu] != ENTRIES_PER_L2_PAGETABLE || + ptwr_writable_idx[cpu]) + printk("switch_to ptwr dirty!!!\n"); + } +#endif + perfc_incrc(sched_ctx); #if defined(WAKE_HISTO) diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index b60e2e5f42..05813d64b7 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -71,9 +71,10 @@ struct pfn_info /* Has this page been validated for use as its current type? */ #define _PGT_validated 28 #define PGT_validated (1<<_PGT_validated) - /* The 10 most significant bits of virt address if this is a L1 page table. 
*/ + /* 10-bit most significant bits of va address if used as l1 page table */ #define PGT_va_shift 18 #define PGT_va_mask (((1<<10)-1)<count_info)) ) + { + /* if the page is pinned, but we're dropping the last reference + then make the va backpointer mutable again */ + nx |= PGT_va_mutable; + } } while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); } @@ -222,15 +230,27 @@ static inline int get_page_type(struct pfn_info *page, u32 type) nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); nx |= type; /* No extra validation needed for writable pages. */ - if ( type == PGT_writable_page ) + if ( (type & PGT_type_mask) == PGT_writable_page ) nx |= PGT_validated; } } - else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) + else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) { - DPRINTK("Unexpected type or va backptr (saw %08x != exp %08x) " - "for pfn %08lx\n", - x & (PGT_type_mask|PGT_va_mask), type, page_to_pfn(page)); + DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n", + x & PGT_type_mask, type, page_to_pfn(page)); + return 0; + } + else if ( (x & PGT_va_mask) == PGT_va_mutable ) + { + /* The va_backpointer is currently mutable, hence we update it. */ + nx &= ~PGT_va_mask; + nx |= type; /* we know the actual type is correct */ + } + else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask) ) ) + { + /* The va backpointer wasn't mutable, and is different :-( */ + DPRINTK("Unexpected va backpointer (saw %08x != exp %08x) for pfn %08lx\n", + x, type, page_to_pfn(page)); return 0; } else if ( unlikely(!(x & PGT_validated)) ) @@ -266,6 +286,55 @@ static inline int get_page_type(struct pfn_info *page, u32 type) return 1; } +/* This 'passive' version of get_page_type doesn't attempt to validate +the page, but just checks the type and increments the type count. 
The +function is called while doing a NORMAL_PT_UPDATE of an entry in an L1 +page table: We want to 'lock' the page for the brief period while +we're doing the update, but we're not actually linking it in to a +pagetable. */ + +static inline int passive_get_page_type(struct pfn_info *page, u32 type) +{ + u32 nx, x, y = page->u.inuse.type_info; + again: + do { + x = y; + nx = x + 1; + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page)); + return 0; + } + else if ( unlikely((x & PGT_count_mask) == 0) ) + { + if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) + { + nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); + nx |= type; + } + } + else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) + { + DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n", + x & PGT_type_mask, type, page_to_pfn(page)); + return 0; + } + else if ( unlikely(!(x & PGT_validated)) ) + { + /* Someone else is updating validation of this page. Wait... */ + while ( (y = page->u.inuse.type_info) != x ) + { + rep_nop(); + barrier(); + } + goto again; + } + } + while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); + + return 1; +} + static inline void put_page_and_type(struct pfn_info *page) { diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h index 2b9f8c7166..49cc4c46a3 100644 --- a/xen/include/hypervisor-ifs/hypervisor-if.h +++ b/xen/include/hypervisor-ifs/hypervisor-if.h @@ -104,9 +104,9 @@ * ptr[1:0] == MMU_EXTENDED_COMMAND: * val[7:0] -- MMUEXT_* command. * - * val[7:0] == MMUEXT_[UN]PIN_TABLE: - * ptr[:2] -- Machine address of frame to be (un)pinned as a top-level p.t. - * page. The frame must belong to the FD, if one is specified. + * val[7:0] == MMUEXT_(UN)PIN_*_TABLE: + * ptr[:2] -- Machine address of frame to be (un)pinned as a p.t. page. + * The frame must belong to the FD, if one is specified.
* * val[7:0] == MMUEXT_NEW_BASEPTR: * ptr[:2] -- Machine address of new page-table base to install in MMU. @@ -145,7 +145,10 @@ #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ #define MMU_MACHPHYS_UPDATE 2 /* ptr = MA of frame to modify entry for */ #define MMU_EXTENDED_COMMAND 3 /* least 8 bits of val demux further */ -#define MMUEXT_PIN_TABLE 0 /* ptr = MA of frame to pin */ +#define MMUEXT_PIN_L1_TABLE 0 /* ptr = MA of frame to pin */ +#define MMUEXT_PIN_L2_TABLE 1 /* ptr = MA of frame to pin */ +#define MMUEXT_PIN_L3_TABLE 2 /* ptr = MA of frame to pin */ +#define MMUEXT_PIN_L4_TABLE 3 /* ptr = MA of frame to pin */ #define MMUEXT_UNPIN_TABLE 1 /* ptr = MA of frame to unpin */ #define MMUEXT_NEW_BASEPTR 2 /* ptr = MA of new pagetable base */ #define MMUEXT_TLB_FLUSH 3 /* ptr = NULL */ -- 2.30.2